作業目的: Data Visualization (02) Text

這份作業希望能夠讓你熟悉中文文字處理,並執行基本的文字相關分析,再將結果以圖表呈現。過程中會運用到過去幾週影片中的 document-level, word-level text analysis, regular expression, and text mining.

這次的作業使用維基文庫提供的歷任中華民國總統就職演說。因為總統就職演說本身代表了每一屆總統任期的,以其重要性,因此國內外媒體時常使用演說的內文當作素材,利用文字探勘的技巧寫出報導,以 2020 年為例,大家可以參考中央社的蔡總統關心什麼 文字會說話 以及 readr 的 少了「年輕人」多了「防疫」:臺灣歷屆民選總統就職演說字詞分析。國外的則可以參考 “I Have The Best Words.” Here’s How Trump’s First SOTU Compares To All The Others. by BuzzFeed, Word Aanalysis of 2016 Presidential debates - Clinton vs. Trump by Martin Krzywinski, and Trump used words like ‘invasion’ and ‘killer’ to discuss immigrants at rallies 500 times: USA TODAY analysis by USA today.

小小的反思:直接用資料、直接用斷詞結果(台灣 vs. 臺灣)可能會出錯喔!

作業: Data Visualization (02) Text

### 這邊不要動
library(tidyverse)
library(jiebaR)
library(tidytext)

df_speech <- read_csv("data/AS06/df_speech.csv")
### 給你看資料長這樣
df_speech %>% glimpse()
#> Rows: 15
#> Columns: 6
#> $ id        <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
#> $ term      <chr> "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "十一", "十二"…
#> $ year      <dbl> 1948, 1954, 1960, 1966, 1972, 1978, 1984, 1990, 1996, 2000, …
#> $ president <chr> "蔣中正", "蔣中正", "蔣中正", "蔣中正", "蔣中正", "蔣經國", "蔣經國", "李登輝", "李登輝…
#> $ title     <chr> "中華民國第一任總統就職演說總統 蔣中正1948年5月20日\n", "中華民國第二任總統就職演說總統 蔣中正1954年…
#> $ text      <chr> "  中正承國民大會依照憲法選舉為中華民國總統,擔任國家和人民的公僕,當此就職伊始,追念我 國父和先烈締造民國的艱難,省…

0. 斷詞:

請利用 library(jiebaR) 斷詞,過程中也要保留詞性的欄位。

### your code
### segment
cutter <- worker("tag", stop_word = "data/segment/df_stopword.txt")
vector_word = c("中華民國", "蔡英文", "李登輝", "蔣中正", "蔣經國", "李登輝", "陳水扁", "馬英九")
new_user_word(cutter, words = "data/segment/dict_jieba.txt")
new_user_word(cutter, words = "data/segment/hand.txt")
new_user_word(cutter, words = "data/segment/news.txt")
new_user_word(cutter, words = vector_word)
reg_space <- "%E3%80%80" %>% curl::curl_escape() 

### text part
df_speech_seg <-
  df_speech %>% 
  mutate(text = str_replace_all(text, "台灣|臺灣", "臺灣")) %>%
  mutate(text = str_remove_all(text, "\\n|\\r|\\t|:| | ")) %>%
  mutate(text = str_remove_all(text, reg_space)) %>%
  mutate(text = str_remove_all(text, "[a-zA-Z0-9]+")) %>%
  mutate(text_segment = purrr::map(text, function(x)segment(x, cutter))) %>%
  mutate(text_POS = purrr::map(text_segment, function(x)names(x)))
#> [1] TRUE
#> [1] TRUE
#> [1] TRUE
#> [1] TRUE

1. 整體熱門詞彙:

請先找出所有總統演說當中出現次數最高的 10 個詞彙,接著計算每屆總統演說時,這些詞彙出現的次數,最後將結果畫成圖表。因為斷詞結果會因為預先載入的詞典有所不同,所以底下的示意圖參考即可,請盡量呈現 有意義的 詞彙!

### your code
df_speech_seg_unnest <- df_speech_seg %>%
  unnest(c(text_segment, text_POS))

df_term_seg_count <- df_speech_seg_unnest %>% 
  count(id, term, year, text_segment, text_POS) %>%
  filter(str_length(text_segment) > 1)

df_seg_count_top <- df_term_seg_count %>% 
  group_by(text_segment, text_POS) %>% summarise(n = sum(n)) %>% 
  arrange(desc(n)) %>% ungroup() %>% filter(! text_segment %in% c("一個")) %>%
  slice(1:10) %>% select(text_segment)

df_term_seg_count %>% 
  inner_join(df_seg_count_top) %>%
  mutate(year = as.factor(year)) %>%
  ggplot(aes(x = year, y = text_segment, fill = n)) + geom_tile() +
  theme_bw() +
  scale_linetype(guide = "none") +
  scale_fill_gradient(low = "white", high = "red")+
  labs(x= "年份",y= "詞彙", title = "歷屆總統演說使用熱詞", fill = "次數") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom") +
  theme(plot.title = element_text(family = "Noto Sans CJK TC Medium", face = "plain", size = 18),
        legend.text = element_text(family = "Noto Sans CJK TC Medium", face = "plain"), 
        text = element_text(family = "Noto Sans CJK TC Medium"))

### your result should be
# 自己畫就好唷

2. 各自熱門詞彙:

請先找出各個總統演說中,出現次數最高的 10 個詞彙,並且將結果畫成圖表。因為斷詞結果會因為預先載入的詞典有所不同,所以底下的示意圖參考即可,請盡量呈現 有意義的 詞彙!

### your code
df_president_seg_count <- df_speech_seg_unnest %>% 
  count(president, text_segment, text_POS) %>%
  filter(str_length(text_segment) > 1)

df_president_seg_count_top <- df_president_seg_count %>% group_by(president) %>%
  arrange(president, desc(n)) %>% mutate(rn = row_number()) %>%
  filter(rn <= 10) %>% ungroup() %>%
  group_by(president) %>% arrange(president, n) %>% ungroup() %>%
  mutate(president = fct_relevel(as.factor(president), "蔣中正", "蔣經國", "李登輝", "陳水扁", "馬英九", "蔡英文"))

df_president_seg_count_top %>%
  mutate(text_segment = reorder_within(text_segment, n, president)) %>%
  ggplot(aes(x = text_segment, y = n)) + geom_col() +
  facet_wrap(president ~ ., scales = "free") +
  coord_flip() +
  theme_bw() +
  scale_linetype(guide = "none") +
  scale_x_reordered() +
  scale_fill_gradient(low = "white", high = "red")+
  labs(x= "年份",y= "詞彙", title = "歷屆總統演說使用熱詞", fill = "次數") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom") +
  theme(plot.title = element_text(family = "Noto Sans CJK TC Medium", face = "plain", size = 18),
        legend.text = element_text(family = "Noto Sans CJK TC Medium", face = "plain"), 
        text = element_text(family = "Noto Sans CJK TC Medium"))

### your result should be
# 自己畫就好唷

3. TF-IDF:

請先篩掉各個總統演說中出現次數小於 5 的詞彙,接著計算 TF-IDF (不知道這是什麼的話請看老師影片!),最後將結果畫成圖表。因為斷詞結果會因為預先載入的詞典有所不同,所以底下的示意圖參考即可,請盡量呈現 有意義的 詞彙!

### your code
df_president_tfidf <- df_president_seg_count %>% filter(n > 5) %>%
  bind_tf_idf(text_segment, president, n) %>%
  group_by(president) %>% arrange(-tf_idf) %>% 
  slice(1:10) %>% ungroup() %>%
  mutate(president = fct_relevel(as.factor(president), "蔣中正", "蔣經國", "李登輝", "陳水扁", "馬英九", "蔡英文")) %>%
  mutate(text_segment = fct_reorder(text_segment, tf_idf))

df_president_tfidf %>%
  mutate(text_segment = reorder_within(text_segment, tf_idf, president)) %>%
  ggplot(aes(x = text_segment, y = tf_idf)) + geom_col() +
  facet_wrap(president ~ ., scales = "free") +
  coord_flip() +
  theme_bw() +
  scale_x_reordered() +
  labs(x= "年份",y= "詞彙", title = "歷屆總統演說使用熱詞", fill = "次數") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom") +
  theme(plot.title = element_text(family = "Noto Sans CJK TC Medium", face = "plain", size = 18),
        legend.text = element_text(family = "Noto Sans CJK TC Medium", face = "plain"), 
        text = element_text(family = "Noto Sans CJK TC Medium"))

### your result should be
# 自己畫就好唷

4. 捉對廝殺:

請先留下蔡英文和馬英九的用詞,接著計算兩者用詞數量差異最大各自前十名的詞彙,最後將結果畫成圖表。因為斷詞結果會因為預先載入的詞典有所不同,所以底下的示意圖參考即可,請盡量呈現 有意義的 詞彙!

### your code
df_ying_seg_count <- df_speech_seg_unnest %>% 
  filter(president %in% c("馬英九", "蔡英文")) %>%
  count(president, text_segment) %>%
  filter(str_length(text_segment) > 1)

df_ying_seg_diff <- df_ying_seg_count %>% 
  pivot_wider(names_from = president, values_from = n, values_fill = list(n = 0)) %>%
  mutate(diff_tsai = `蔡英文` - `馬英九`, diff_ma = -diff_tsai)

df_ying_seg_diff %>% arrange(desc(diff_tsai)) %>% slice(1:10) %>%
  select(text_segment, diff = diff_tsai) %>% mutate(president = "蔡英文") %>%
  bind_rows(
    df_ying_seg_diff %>% arrange(desc(diff_ma)) %>% slice(1:10) %>%
      select(text_segment, diff = diff_ma) %>% mutate(president = "馬英九")  
  ) %>%
  mutate(diff2 = if_else(president == "馬英九", -diff, diff)) %>%
  mutate(text_segment = reorder(text_segment, diff2)) %>%
  ggplot(aes(x = diff2, y = text_segment, fill = president)) + geom_col() +
  theme_bw() +
  scale_x_continuous(limits = c(-50, 50)) +
  scale_fill_manual(values = c("#1B9431", "#000095")) +
  labs(x= "次數",y= "詞彙", title = "雙英對決:馬英九與蔡英文使用次數差異最大詞彙", fill = "總統") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(), axis.line = element_line(colour = "black")) +
  theme(legend.position="bottom") +
  theme(plot.title = element_text(family = "Noto Sans CJK TC Medium", face = "plain", size = 18),
        legend.text = element_text(family = "Noto Sans CJK TC Medium", face = "plain"), 
        text = element_text(family = "Noto Sans CJK TC Medium"))

### your result should be
# 自己畫就好唷